# TwoSampleTestR.r
# Randomization test for difference between two means.
# I will begin using the mean difference as my stsatistic,
# and then move to using the t statistic for the same purpose.
# When we pool the variances they should give the same result.

NoWaiting <- c(36.30, 42.07, 39.97, 39.33, 33.76, 33.91, 39.65, 84.92, 40.70, 39.65,
39.48, 35.38, 75.07, 36.46, 38.73, 33.88, 34.39, 60.52, 53.63 )

Waiting <- c(49.48, 43.30, 85.97, 46.92, 49.18, 79.30, 47.35, 46.52, 59.68, 42.89,
49.29, 68.69, 41.61, 46.81, 43.75, 46.55, 42.33, 71.48, 78.95)
n1 <- length(NoWaiting)
n2 <- length(Waiting)
N <- n1 + n2
meanNW <- mean(NoWaiting) ; meanW <- mean(Waiting) 
diffObt <- (meanW - meanNW)
Combined <- c(NoWaiting, Waiting)     # Combining the samples
    #Now pool the variances
s2p <- (var(NoWaiting)*(n1-1) + var(Waiting)*(n2-1))/(n1 + n2 - 2)
    # Write out the results
cat("The obtained difference between the means is = ",diffObt, '\n')
cat("The pooled variance is = ", s2p, '\n')
cat("Now we will run a standard parametric t test \n")
ttest <- t.test(Waiting, NoWaiting)
cat("The resulting t test on raw data is = ", ttest$statistic, '\n')
print(ttest)

tObt <-  ttest$statistic
nreps <- 5000             # I am using 5000 resamplings of the data
meanDiff <- numeric(nreps)   #Setting up arrays to hold the results
t <- numeric(nreps)
set.seed(1086)
for ( i in 1:nreps) {
      data <- sample(Combined, N,  replace = FALSE)
      grp1 <- data[1:n1]
      grp2 <- na.omit(data[n1+1: N])
      meanDiff[i] <- mean(grp1) - mean(grp2)
      test <- t.test(grp1, grp2)
      t[i] <- test$statistic
      }

# Just to demonstrate the equivalence of mean differences and t
cat("The correlation between mean differences and t is = ",'\n')
print(cor(meanDiff, t))
cat('\n')

# Rather than incrementing a counter as I go along, I am counting at the end.
cat("The number of times when the absolute mean differences exceeded diffObt = ",'\n')
absMeanDiff <- abs(meanDiff)
absDiffObt = abs(diffObt)
print(length(absMeanDiff[absMeanDiff >= absDiffObt]))
cat("The number of times when the t values exceeded the obtained t = ",'\n')
abst <- abs(t)
abstObt <- abs(tObt)
print(length(abst[abst >= abstObt]))
cat("The proportion of resamplings when each the mean diff or t exceeded the 
obtained value = ",'\n')
print (length(abs(absMeanDiff[absMeanDiff >= absDiffObt]))/nreps)
cat('\n')

par(mfrow = c(2,2))           # Set up graphic surface
hist(meanDiff, breaks = 25, ylim = c(0,425))

hist(t, breaks = 50, xlim = c(-5,5), ylim = c(0,425))

#  The following plots the empirical distribution of t from resampling
#     against the actual Student's t distribution on 36 df.
#     Notice the similarity even though the data are badly skewed.
hist(t, freq = FALSE, xlim = c(-5,5), ylim = c(0,.4 ), col = "lightblue",
ylab = "density", xlab = "t", main = "")
par(new = T)
den <- seq(-5, 4.998, .002)
tdens <- dt(den,36)
plot(den, tdens, type = "l", ylim = c(0,.4), xlim = c(-5,5),
  col = "red", ylab = "", xlab = "", main = "Student t and resample t")


# dch   12/3/2013